# Computations
import numpy as np
import pandas as pd
import scipy.stats as stats
# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, KFold, StratifiedShuffleSplit
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFE
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Global matplotlib defaults applied to every figure in this notebook.
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib >= 3.6
# (renamed 'seaborn-v0_8-whitegrid') -- confirm the pinned matplotlib version.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import warnings
# Suppress all library warnings (deprecations, convergence notices) to keep cell output clean.
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for prediction.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The datasets consist of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| Blood Pressure | Diastolic blood pressure (mm Hg) |
| Skin Thickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| Diabetes Pedigree Function | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner label followed by a '=' rule padded to total width L.

    C selects the label's background color, T the label's text color (colorama names).
    """
    back_codes = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                  'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                  'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    fore_codes = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    label = back_codes[C] + fore_codes[T] + Style.NORMAL + Text + Style.RESET_ALL
    rule = fore_codes[C] + Style.NORMAL + '=' * (L - len(Text) - 1) + Style.RESET_ALL
    print(label + ' ' + rule)
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C (colorama name)."""
    fore_codes = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(fore_codes[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
# Load the pre-standardized (z-scored) Pima diabetes table; path is relative to the notebook.
Data = pd.read_csv('pima-indians-diabetes-database/diabetes_STD.csv')
Header('Standardized Dataset:')
display(Data.head())
# One-row shape summary; hide_index() suppresses the dummy 0 row label.
# NOTE(review): Styler.hide_index() was removed in pandas 2.0 (use .hide(axis='index')) -- confirm pandas version.
display(pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]}).style.hide_index())
Standardized Dataset: ==============================================================================
| Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.639947 | 0.848324 | 0.149641 | 0.907270 | -0.692891 | 0.204013 | 0.468492 | 1.425995 | 1 |
| 1 | -0.844885 | -1.123396 | -0.160546 | 0.530902 | -0.692891 | -0.684422 | -0.365061 | -0.190672 | 0 |
| 2 | 1.233880 | 1.943724 | -0.263941 | -1.288212 | -0.692891 | -1.103255 | 0.604397 | -0.105584 | 1 |
| 3 | -0.844885 | -0.998208 | -0.160546 | 0.154533 | 0.123302 | -0.494043 | -0.920763 | -1.041549 | 0 |
| 4 | -1.141852 | 0.504055 | -1.504687 | 0.907270 | 0.765836 | 1.409746 | 5.484909 | -0.020496 | 1 |
| Number of Instances | Number of Attributes |
|---|---|
| 768 | 9 |
# Split the frame into features X and the binary target y.
Target = 'Outcome'
X = Data.drop(columns = [Target])
y = Data[Target]
# Map the 0/1 Outcome codes to the human-readable class names used in plots and reports.
Labels_dict = dict(zip([0, 1], ['Non-Diabetic', 'Diabetic']))
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
    """Show the class distribution of Inp[Target]: a count/percentage table (left)
    next to a pie chart (right), in one plotly figure.

    Parameters
    ----------
    Inp : pd.DataFrame
        Dataset containing the target column.
    Target : str
        Name of the target column.
    Labels_dict : dict
        Mapping from raw class codes to readable labels.
    PD : dict
        Plot options: colors, sizes, 'pull', 'hole', title position, column widths, ...
    """
    # Table of per-class counts and percentages (classes in frequency order).
    Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
    Table[Target] = Table[Target].replace(Labels_dict)
    Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
                        specs=[[{"type": "table"},{"type": "pie"}]])
    # Right panel: donut chart of the class counts.
    fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                         pull=PD['pull'], textfont=dict(size= PD['textfont']),
                         marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
    # Left panel: the same numbers as a table; percentages rendered as '%NN.NN'.
    T = Table.copy()
    T['Percentage'] = T['Percentage'].map(lambda x: '%%%.2f' % x)
    # go.Table wants one values-array per column.
    Temp = []
    for i in T.columns:
        Temp.append(T.loc[:,i].values)
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= PD['TableColors'][0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# Pull only the last pie slice out by 5% for emphasis; all other slices stay in place.
Pull = [0 for x in range((len(Labels_dict)-1))]
Pull.append(.05)
# Shared plot-option dict consumed by DatasetTargetDist.
PD = dict(PieColors = ['SeaGreen','FireBrick'],
          TableColors = ['Navy','White'], hole = .4,
          column_widths=[0.6, 0.4],textfont = 14, height = 350, tablecolumnwidth = [0.20, 0.12, 0.15],
          pull = Pull, legend_title = Target, title_x = 0.5, title_y = 0.8)
del Pull
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit (used below) returns stratified randomized train/test splits: each split contains approximately the same percentage of samples of each target class as the complete set.
# Single stratified 70/30 split; random_state fixes the shuffle for reproducibility.
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
for train_index, test_index in sss.split(X, y):
    # split() yields POSITIONAL indices, so pandas objects must be indexed with
    # .iloc -- .loc / [] only coincide with positions for a default RangeIndex.
    if isinstance(X, pd.DataFrame):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    else:
        X_train, X_test = X[train_index], X[test_index]
    if isinstance(y, pd.Series):
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    else:
        y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, PD, Labels_dict = Labels_dict):
    """Visualize the train/test split: a shape table (left) plus one target-distribution
    pie for the train set (middle) and one for the test set (right).

    PD supplies the plotly styling: 'column_widths', 'pull', 'textfont', 'PieColors',
    'TableColors', 'tablecolumnwidth', 'legend_title', 'title_x'/'title_y', 'height'.
    """
    def ToSeries(x):
        # Accept either a pandas Series or a plain array of labels.
        return x.copy() if isinstance(x, pd.Series) else pd.Series(x)
    fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= PD['column_widths'],
                        specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
    # Pies: train set goes in column 2, test set in column 3.
    # (renamed the loop variable -- the original `y` shadowed the module-level target series)
    col = 2
    for fold_labels in [ToSeries(y_train).replace(Labels_dict), ToSeries(y_test).replace(Labels_dict)]:
        # reindex aligns the counts with the fixed label order passed to `labels=`;
        # value_counts() alone returns frequency order, which could mismatch the labels.
        counts = fold_labels.value_counts().reindex(list(Labels_dict.values()))
        fig.add_trace(go.Pie(labels= list(Labels_dict.values()),
                             values= counts.values, pull=PD['pull'],
                             textfont=dict(size=PD['textfont']),
                             marker=dict(colors = PD['PieColors'],
                                         line=dict(color='black', width=1))), row=1, col=col)
        fig.update_traces(hole=.5)
        fig.update_layout(legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
        col += 1
    # Left panel: shapes of the four split arrays, as strings.
    Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
                               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
    Temp = [Table.loc[:, c].values for c in Table.columns]   # one values-array per column
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
                                       fill_color= TableColors[0], align=['center','center'],
                                       font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color='darkslategray',
                                      fill=dict(color= [TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 1)
    fig.update_layout(title={'text': '<b>' + 'Dataset Distribution' + '<b>', 'x':PD['title_x'],
                             'y':PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    if PD['height'] is not None:     # idiom: identity test against None (was `not ... == None`)
        fig.update_layout(height = PD['height'])
    fig.show()
# Reuse the PD options dict from the earlier cell, overriding layout for the 3-panel figure.
PD.update(dict(column_widths=[0.3, 0.3, 0.3], tablecolumnwidth = [0.2, 0.4], height = 350, legend_title = Target))
Train_Test_Dist(X_train, y_train, X_test, y_test, PD)
Gradient Boosting Classifier (GBC) optimizes a model in several stages using a differentiable loss function. See sklearn.ensemble.GradientBoostingClassifier for more details.
# NOTE(review): exact duplicate of the Header() defined earlier in this notebook;
# harmless (rebinds the same name to identical code) but one copy should be removed.
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner label followed by a '=' rule padded to width L."""
    BACK = {'Black': Back.BLACK, 'Red':Back.RED, 'Green':Back.GREEN, 'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta':Back.MAGENTA, 'Cyan': Back.CYAN}
    FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
            'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    print(BACK[C] + FORE[T] + Style.NORMAL + Text + Style.RESET_ALL + ' ' + FORE[C] +
          Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
# NOTE(review): duplicate of the Line() defined earlier -- one copy should be removed.
def Line(L=100, C = 'Blue'):
    """Print a horizontal rule of L '=' characters in color C."""
    FORE = {'Black': Fore.BLACK, 'Red':Fore.RED, 'Green':Fore.GREEN, 'Yellow':Fore.YELLOW, 'Blue':Fore.BLUE,
            'Magenta':Fore.MAGENTA, 'Cyan':Fore.CYAN, 'White': Fore.WHITE}
    print(FORE[C] + Style.NORMAL + L*'=' + Style.RESET_ALL)
def Search_List(Key, List): return [s for s in List if Key in s]
def Best_Parm(model, param_dist, Top = None, X = X, y = y, n_splits = 20, scoring = 'precision', H = 600, titleY = .95):
    """Randomized hyper-parameter search with stratified shuffle-split CV.

    Fits a RandomizedSearchCV over param_dist, displays the Top best-ranked rows of
    the results table (all rows when Top is None), plots the train/test score curves,
    and returns the fitted search object.

    Parameters
    ----------
    model : estimator to tune
    param_dist : dict of parameter lists to sample from
    Top : int or None -- number of top-ranked rows to show (None = all)
    X, y : data (default: the notebook-level feature frame and target)
    n_splits, scoring : CV configuration
    H, titleY : figure height and title position forwarded to the plot helper
    """
    # NOTE(review): n_iter=1000 exceeds this small grid's size; sklearn falls back to
    # exhaustive sampling with a warning -- harmless but worth confirming intent.
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=Test_Size, random_state=42),
                              n_iter = int(1e3), scoring = scoring, error_score = 0, verbose = 0,
                              n_jobs = 10, return_train_score = True)
    _ = grid.fit(X, y)
    Table = Grid_Table(grid)
    if Top is None:          # idiom: identity test against None (was `== None`)
        Top = Table.shape[0]
    Table = Table.iloc[:Top, :]
    # Build human-readable "mean ± std" columns for display, then drop the raw ones.
    T = Table.copy()
    T['Train Score'] = T['Mean Train Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Train Score'].map(lambda x: ('%.2e' % x))
    T['Test Score'] = T['Mean Test Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Test Score'].map(lambda x: ('%.2e' % x))
    T['Fit Time'] = T['Mean Fit Time'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Fit Time'].map(lambda x: ('%.2e' % x))
    T = T.drop(columns = ['Mean Train Score','STD Train Score','Mean Test Score','STD Test Score','Mean Fit Time','STD Fit Time'])
    display(T.head(Top).style.hide_index().background_gradient(subset= ['Rank Test Score'],
            cmap=sns.diverging_palette(145, 300, s=60, as_cmap=True)).\
            set_properties(subset=['Params'], **{'background-color': 'Indigo', 'color': 'White'}).\
            set_properties(subset=['Train Score'], **{'background-color': 'HoneyDew', 'color': 'Black'}).\
            set_properties(subset=['Test Score'], **{'background-color': 'Azure', 'color': 'Black'}).\
            set_properties(subset=['Fit Time'], **{'background-color': 'Linen', 'color': 'Black'}))
    # Plot the per-candidate train/test scores with error bars.
    Grid_Performance_Plot(Table, n_splits = n_splits, H = H, titleY = titleY)
    return grid
def Grid_Table(grid):
    """Summarize a fitted search object's cv_results_ as a DataFrame sorted by rank.

    Columns: rank, a flattened "key: value, ..." parameter string, and the
    mean/std of train score, test score and fit time.
    """
    res = grid.cv_results_
    # Flatten each sampled parameter dict into a plain "k: v, ..." string.
    params = [str(p).replace('{', '').replace('}', '').replace("'", '')
              for p in res['params']]
    summary = pd.DataFrame({'Rank Test Score': res['rank_test_score'],
                            'Params': params,
                            # Train
                            'Mean Train Score': res['mean_train_score'],
                            'STD Train Score': res['std_train_score'],
                            # Test
                            'Mean Test Score': res['mean_test_score'],
                            'STD Test Score': res['std_test_score'],
                            # Fit time
                            'Mean Fit Time': res['mean_fit_time'],
                            'STD Fit Time': res['std_fit_time']})
    return summary.sort_values('Rank Test Score').reset_index(drop = True)
def Grid_Performance_Plot(Table, n_splits, H = 550, titleY =.95):
    """Plot mean ± std train (left) and test (right) scores for each parameter
    combination in a Grid_Table-style results frame, with a shared y-axis range."""
    # Y-axis lower bound padded below the smallest (mean - std) value.
    # NOTE(review): (Temp*100 - Temp) == 99*Temp, so L ~= floor(99*min)/100; this looks
    # like it was meant to be np.floor(Temp*100 - 1)/100 (a 0.01 margin) -- confirm intent.
    Temp = Table['Mean Train Score']-Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']-Table['STD Test Score'])
    L = np.floor((Temp*100- Temp)).min()/100
    # Upper bound padded above the largest (mean + std) value; same caveat as above.
    Temp = Table['Mean Train Score']+Table['STD Train Score']
    Temp = np.append(Temp, Table['Mean Test Score']+Table['STD Test Score'])
    R = np.ceil((Temp*100 + Temp)).max()/100
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        subplot_titles=('<b>' + 'Train Set' + '<b>', '<b>' + 'Test Set' + '<b>'))
    # One scatter-with-error-bars trace per subplot: train scores left, test scores right.
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Train Score'], showlegend=False, marker_color= 'SeaGreen',
                             error_y=dict(type='data',array=Table['STD Train Score'], visible=True)), 1, 1)
    fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Test Score'], showlegend=False, marker_color= 'RoyalBlue',
                             error_y=dict(type='data',array= Table['STD Test Score'], visible=True)), 1, 2)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range= [L, R])
    fig.update_yaxes(title_text="Mean Score", row=1, col=1)
    fig.update_layout(plot_bgcolor= 'white', width = 980, height = H,
                      title={'text': '<b>' + 'RandomizedSearchCV with %i-fold cross validation' % n_splits + '<b>',
                             'x':0.5, 'y':titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def _CV_Aggregate(report_list, cm_list, template):
    # Purpose: average a list of per-fold classification-report arrays and confusion
    # matrices into a "mean ± std" string table and a rounded mean integer matrix.
    # np.stack (unlike the original incremental vstack) also works for a single fold,
    # where the old 1-D accumulator broke the axis=0 mean/reshape.
    stacked = np.stack([r.ravel() for r in report_list])
    Mean = pd.DataFrame(stacked.mean(axis = 0).reshape(template.shape),
                        index = template.index, columns = template.columns)
    STD = pd.DataFrame(stacked.std(axis = 0).reshape(template.shape),
                       index = template.index, columns = template.columns)
    report = Mean.applymap(lambda x: ('%.4f' % x)) + ' ± ' + STD.applymap(lambda x: ('%.4f' % x))
    cm = np.stack([c.ravel() for c in cm_list]).mean(axis = 0).reshape(cm_list[0].shape).round(0).astype(int)
    return report, cm

def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10, Labels = list(Labels_dict.values())):
    """Fit `model` on n_splits stratified shuffle splits and aggregate the per-fold
    classification reports and confusion matrices for both the train and test parts.

    Returns
    -------
    Reports_Train, Reports_Test : pd.DataFrame
        "mean ± std" classification-report tables (first column names the set).
    CM_Train, CM_Test : np.ndarray
        Fold-averaged confusion matrices, rounded to integers.
    """
    sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
    # Work on raw arrays so positional fold indices apply uniformly.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    Reports_Train, Reports_Test, CM_Train, CM_Test = [], [], [], []
    R = None   # last per-fold report frame; reused as the index/column template
    for train_index, test_index in sss.split(X, y):
        X_tr, X_te = X[train_index], X[test_index]
        y_tr, y_te = y[train_index], y[test_index]
        _ = model.fit(X_tr, y_tr)
        # Train-part scores for this fold.
        y_pred = model.predict(X_tr)
        R = pd.DataFrame(metrics.classification_report(y_tr, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Train.append(R.values)
        CM_Train.append(metrics.confusion_matrix(y_tr, y_pred))
        # Test-part scores for this fold.
        y_pred = model.predict(X_te)
        R = pd.DataFrame(metrics.classification_report(y_te, y_pred, target_names=Labels, output_dict=True)).T
        Reports_Test.append(R.values)
        CM_Test.append(metrics.confusion_matrix(y_te, y_pred))
    Reports_Train, CM_Train = _CV_Aggregate(Reports_Train, CM_Train, R)
    Reports_Test, CM_Test = _CV_Aggregate(Reports_Test, CM_Test, R)
    Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
    Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
    return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalized confusion-matrix heatmaps, one figure per set.

    Parameters
    ----------
    CM_Train, CM_Test : 2-D arrays of counts.
    PD : dict with 'FS' (figure size), 'annot_kws' (annotation font size),
         'shrink' (colorbar shrink factor) and 'Labels' (tick labels).
    n_splits : int or None -- only used in the figure titles; None for a non-CV run.
    """
    if n_splits is None:     # idiom: identity test against None (was `== None`)
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    CM = [CM_Train, CM_Test]
    Cmap = ['Greens', 'YlGn','Blues', 'PuBu']   # (raw, normalized) colormap pair per set
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
        # Left: raw counts.
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix')
        # Right: each row (true class) normalized to proportions.
        Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(Temp,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix')
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels')
            _ = a.xaxis.set_ticklabels(PD['Labels'])
            _ = a.yaxis.set_ticklabels(PD['Labels'])
            _ = a.set_aspect(1)
Some of the metrics that we use here to measure the accuracy (following scikit-learn's confusion-matrix layout, where rows are true classes): \begin{align} \text{Confusion Matrix} = \begin{bmatrix}T_n & F_p\\ F_n & T_p\end{bmatrix}. \end{align}
where $T_p$, $T_n$, $F_p$, and $F_n$ represent true positive, true negative, false positive, and false negative, respectively.
\begin{align} \text{Precision} &= \frac{T_{p}}{T_{p} + F_{p}},\\ \text{Recall} &= \frac{T_{p}}{T_{p} + F_{n}},\\ \text{F1} &= \frac{2 \times \text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}\\ \text{Balanced-Accuracy (bACC)} &= \frac{1}{2}\left( \frac{T_{p}}{T_{p} + F_{n}} + \frac{T_{n}}{T_{n} + F_{p}}\right ) \end{align}The accuracy can be a misleading metric for imbalanced data sets. In these cases, a balanced accuracy (bACC) [4] is recommended that normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two.
# --- Gradient Boosting Classifier with sklearn's default hyper-parameters ---
Header('Gradient Boosting Classifier with Default Parameters')
n_splits = 20
GBC= GradientBoostingClassifier()
print('Default Parameters = %s' % GBC.get_params(deep=True))
_ = GBC.fit(X_train, y_train)
# CV-averaged classification reports and confusion matrices over 20 stratified splits.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GBC, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'RoyalBlue', 'color': 'White'}))
PD = dict(FS = (10, 5), annot_kws = 14, shrink = .6, Labels = list(Labels_dict.values()))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)

def _Print_Rates(CM, Tag):
    # Print precision/recall/TPR/TNR/balanced accuracy derived from one confusion
    # matrix; factored out of the previously duplicated train/test code. The unused
    # PPCR computation (never printed, never deleted) was dropped.
    # sklearn convention: ravel() of a 2x2 confusion matrix yields (tn, fp, fn, tp).
    tn, fp, fn, tp = CM.ravel()
    TPR = tp/(tp + fn)
    TNR = tn/(tn + fp)
    print('Precision (%s) = %.2f' % (Tag, tp/(tp + fp)))
    print('Recall (%s) = %.2f' % (Tag, TPR))
    print('TPR (%s) = %.2f' % (Tag, TPR))
    print('TNR (%s) = %.2f' % (Tag, TNR))
    print('Balanced Accuracy (%s) = %.2f' % (Tag, (TPR + TNR)/2))

Header('Train Set', C = 'Green')
_Print_Rates(CM_Train, 'Train')
Header('Test Set')
_Print_Rates(CM_Test, 'Test')
Line()
Gradient Boosting Classifier with Default Parameters =============================================== Default Parameters = {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.9346 ± 0.0097 | 0.9744 ± 0.0054 | 0.9541 ± 0.0055 | 350.0000 ± 0.0000 |
| Diabetic | 0.9481 ± 0.0102 | 0.8722 ± 0.0204 | 0.9084 ± 0.0121 | 187.0000 ± 0.0000 |
| accuracy | 0.9388 ± 0.0075 | 0.9388 ± 0.0075 | 0.9388 ± 0.0075 | 0.9388 ± 0.0075 |
| macro avg | 0.9414 ± 0.0071 | 0.9233 ± 0.0102 | 0.9312 ± 0.0088 | 537.0000 ± 0.0000 |
| weighted avg | 0.9393 ± 0.0073 | 0.9388 ± 0.0075 | 0.9382 ± 0.0078 | 537.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.7965 ± 0.0183 | 0.8340 ± 0.0275 | 0.8145 ± 0.0157 | 150.0000 ± 0.0000 |
| Diabetic | 0.6644 ± 0.0355 | 0.6043 ± 0.0469 | 0.6317 ± 0.0318 | 81.0000 ± 0.0000 |
| accuracy | 0.7535 ± 0.0198 | 0.7535 ± 0.0198 | 0.7535 ± 0.0198 | 0.7535 ± 0.0198 |
| macro avg | 0.7305 ± 0.0229 | 0.7192 ± 0.0228 | 0.7231 ± 0.0223 | 231.0000 ± 0.0000 |
| weighted avg | 0.7502 ± 0.0202 | 0.7535 ± 0.0198 | 0.7504 ± 0.0199 | 231.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.95 Recall (Train) = 0.87 TPR (Train) = 0.87 TNR (Train) = 0.97 Balanced Accuracy (Train) = 0.92 Test Set =========================================================================================== Precision (Test) = 0.66 Recall (Test) = 0.60 TPR (Test) = 0.60 TNR (Test) = 0.83 Balanced Accuracy (Test) = 0.72 ====================================================================================================
In order to find the best parameters for our model, we can use RandomizedSearchCV. Here, we have defined a function Best_Parm to find the best parameters.
# Gradient Boosting Classifier hyper-parameter search
# (the original comment here said "KNeighborsClassifier" -- the model below is a GBC)
GBC= GradientBoostingClassifier()
# NOTE(review): 'deviance' was renamed 'log_loss' in sklearn 1.1 and later removed
# under the old name -- confirm the pinned sklearn version.
param_dist = dict(loss = ['deviance', 'exponential'],
                  learning_rate = [.8, .9, 1.0],
                  n_estimators= [100, 200, 1000],
                  max_leaf_nodes = [None, 2, 3])
Header('Gradient Boosting Classifier with the Best Parameters')
grid = Best_Parm(model = GBC, param_dist = param_dist, Top = 20, H = 750, titleY =.96)
Gradient Boosting Classifier with the Best Parameters ==============================================
| Rank Test Score | Params | Train Score | Test Score | Fit Time |
|---|---|---|---|---|
| 1 | n_estimators: 100, max_leaf_nodes: 2, loss: exponential, learning_rate: 0.8 | 8.11e-01 ± 1.61e-02 | 6.65e-01 ± 3.64e-02 | 7.43e-02 ± 1.61e-03 |
| 2 | n_estimators: 100, max_leaf_nodes: 2, loss: exponential, learning_rate: 0.9 | 8.14e-01 ± 1.91e-02 | 6.62e-01 ± 3.66e-02 | 7.64e-02 ± 2.35e-03 |
| 3 | n_estimators: 100, max_leaf_nodes: 2, loss: exponential, learning_rate: 1.0 | 8.21e-01 ± 2.23e-02 | 6.58e-01 ± 3.52e-02 | 7.32e-02 ± 2.14e-03 |
| 4 | n_estimators: 100, max_leaf_nodes: 2, loss: deviance, learning_rate: 0.8 | 8.33e-01 ± 2.16e-02 | 6.56e-01 ± 3.80e-02 | 6.52e-02 ± 3.60e-03 |
| 5 | n_estimators: 100, max_leaf_nodes: 2, loss: deviance, learning_rate: 0.9 | 8.37e-01 ± 2.56e-02 | 6.54e-01 ± 3.53e-02 | 7.36e-02 ± 1.69e-03 |
| 6 | n_estimators: 200, max_leaf_nodes: 2, loss: exponential, learning_rate: 0.8 | 8.43e-01 ± 1.88e-02 | 6.53e-01 ± 3.41e-02 | 1.46e-01 ± 4.25e-03 |
| 7 | n_estimators: 200, max_leaf_nodes: 2, loss: deviance, learning_rate: 0.8 | 8.71e-01 ± 2.04e-02 | 6.51e-01 ± 3.20e-02 | 1.24e-01 ± 6.46e-03 |
| 8 | n_estimators: 100, max_leaf_nodes: 2, loss: deviance, learning_rate: 1.0 | 8.45e-01 ± 2.85e-02 | 6.50e-01 ± 3.12e-02 | 7.42e-02 ± 2.04e-03 |
| 9 | n_estimators: 200, max_leaf_nodes: 2, loss: exponential, learning_rate: 0.9 | 8.56e-01 ± 1.74e-02 | 6.48e-01 ± 4.12e-02 | 1.50e-01 ± 3.92e-03 |
| 10 | n_estimators: 200, max_leaf_nodes: 2, loss: deviance, learning_rate: 0.9 | 8.82e-01 ± 1.51e-02 | 6.48e-01 ± 3.44e-02 | 1.43e-01 ± 2.31e-03 |
| 11 | n_estimators: 200, max_leaf_nodes: 2, loss: exponential, learning_rate: 1.0 | 8.62e-01 ± 1.67e-02 | 6.46e-01 ± 3.86e-02 | 1.45e-01 ± 3.43e-03 |
| 12 | n_estimators: 200, max_leaf_nodes: 2, loss: deviance, learning_rate: 1.0 | 8.82e-01 ± 3.49e-02 | 6.45e-01 ± 3.66e-02 | 1.47e-01 ± 3.51e-03 |
| 13 | n_estimators: 100, max_leaf_nodes: 3, loss: exponential, learning_rate: 0.8 | 9.69e-01 ± 8.44e-03 | 6.42e-01 ± 4.37e-02 | 8.60e-02 ± 2.09e-03 |
| 14 | n_estimators: 100, max_leaf_nodes: 3, loss: exponential, learning_rate: 0.9 | 9.75e-01 ± 9.82e-03 | 6.41e-01 ± 2.88e-02 | 8.64e-02 ± 2.31e-03 |
| 15 | n_estimators: 100, max_leaf_nodes: 3, loss: exponential, learning_rate: 1.0 | 9.85e-01 ± 8.12e-03 | 6.32e-01 ± 4.17e-02 | 8.63e-02 ± 3.38e-03 |
| 16 | n_estimators: 200, max_leaf_nodes: 3, loss: exponential, learning_rate: 0.9 | 1.00e+00 ± 1.16e-03 | 6.32e-01 ± 4.18e-02 | 1.68e-01 ± 3.48e-03 |
| 17 | n_estimators: 200, max_leaf_nodes: 3, loss: exponential, learning_rate: 0.8 | 1.00e+00 ± 0.00e+00 | 6.32e-01 ± 4.16e-02 | 1.69e-01 ± 3.62e-03 |
| 18 | n_estimators: 100, max_leaf_nodes: None, loss: exponential, learning_rate: 0.8 | 1.00e+00 ± 0.00e+00 | 6.28e-01 ± 3.96e-02 | 9.91e-02 ± 8.77e-03 |
| 19 | n_estimators: 200, max_leaf_nodes: None, loss: exponential, learning_rate: 1.0 | 1.00e+00 ± 0.00e+00 | 6.26e-01 ± 4.60e-02 | 2.27e-01 ± 5.19e-03 |
| 20 | n_estimators: 100, max_leaf_nodes: None, loss: exponential, learning_rate: 0.9 | 1.00e+00 ± 0.00e+00 | 6.25e-01 ± 4.36e-02 | 1.19e-01 ± 3.20e-03 |
Since we have identified the best parameters for our modeling, we train another model using these parameters.
# --- Re-fit a fresh GBC with the best parameter combination found by the search ---
Header('Gradient Boosting Classifier with the Best Parameters')
GBC = GradientBoostingClassifier(**grid.best_params_)
# NOTE(review): the label below is misleading -- these are the TUNED parameters, not defaults.
print('Default Parameters = %s' % GBC.get_params(deep=True))
_ = GBC.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GBC, X = X, y = y, n_splits = 20)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Header('Train Set', C = 'Green')
# sklearn convention: ravel() of a 2x2 confusion matrix yields (tn, fp, fn, tp).
tn, fp, fn, tp = CM_Train.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)
TNR = tn/(tn +fp)
BA = (TPR + TNR)/2
print('Precision (Train) = %.2f' % Precision)
print('Recall (Train) = %.2f' % Recall)
print('TPR (Train) = %.2f' % TPR)
print('TNR (Train) = %.2f' % TNR)
print('Balanced Accuracy (Train) = %.2f' % BA)
Header('Test Set')
tn, fp, fn, tp = CM_Test.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)
TNR = tn/(tn +fp)
BA = (TPR + TNR)/2
# NOTE(review): PPCR is computed but never printed, and it is missing from the del below.
PPCR = (tp + fp)/(tp + fp + tn+ fn)
print('Precision (Test) = %.2f' % Precision)
print('Recall (Test) = %.2f' % Recall)
print('TPR (Test) = %.2f' % TPR)
print('TNR (Test) = %.2f' % TNR)
print('Balanced Accuracy (Test) = %.2f' % BA)
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA
Line()
Gradient Boosting Classifier with the Best Parameters ============================================== Default Parameters = {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.8, 'loss': 'exponential', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': 2, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.8654 ± 0.0079 | 0.9081 ± 0.0095 | 0.8862 ± 0.0065 | 350.0000 ± 0.0000 |
| Diabetic | 0.8108 ± 0.0161 | 0.7356 ± 0.0179 | 0.7712 ± 0.0134 | 187.0000 ± 0.0000 |
| accuracy | 0.8480 ± 0.0086 | 0.8480 ± 0.0086 | 0.8480 ± 0.0086 | 0.8480 ± 0.0086 |
| macro avg | 0.8381 ± 0.0101 | 0.8219 ± 0.0100 | 0.8287 ± 0.0098 | 537.0000 ± 0.0000 |
| weighted avg | 0.8464 ± 0.0088 | 0.8480 ± 0.0086 | 0.8462 ± 0.0088 | 537.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.7927 ± 0.0201 | 0.8373 ± 0.0276 | 0.8140 ± 0.0166 | 150.0000 ± 0.0000 |
| Diabetic | 0.6646 ± 0.0364 | 0.5932 ± 0.0516 | 0.6255 ± 0.0361 | 81.0000 ± 0.0000 |
| accuracy | 0.7517 ± 0.0214 | 0.7517 ± 0.0214 | 0.7517 ± 0.0214 | 0.7517 ± 0.0214 |
| macro avg | 0.7286 ± 0.0245 | 0.7153 ± 0.0254 | 0.7198 ± 0.0249 | 231.0000 ± 0.0000 |
| weighted avg | 0.7478 ± 0.0221 | 0.7517 ± 0.0214 | 0.7479 ± 0.0219 | 231.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.81 Recall (Train) = 0.74 TPR (Train) = 0.74 TNR (Train) = 0.91 Balanced Accuracy (Train) = 0.82 Test Set =========================================================================================== Precision (Test) = 0.67 Recall (Test) = 0.59 TPR (Test) = 0.59 TNR (Test) = 0.84 Balanced Accuracy (Test) = 0.72 ====================================================================================================
# --- Recursive feature elimination: score GBC models keeping n = 4..8 features ---
rows = []
for n in range(4, X.shape[1]+1):
    selector = RFE(estimator= GBC, n_features_to_select=n, verbose=0)
    selector.fit(X_train, y_train)
    rows.append({'Number of Features to Select': n,
                 'Train F1 Score': metrics.f1_score(y_train, selector.predict(X_train)),
                 'Test F1 Score': metrics.f1_score(y_test, selector.predict(X_test)),
                 'Train Recall Score': metrics.recall_score(y_train, selector.predict(X_train)),
                 'Test Recall Score': metrics.recall_score(y_test, selector.predict(X_test)),
                 # ranking_ == 1 marks the features RFE kept.
                 'Best Features':X.columns[selector.ranking_ == 1].tolist()})
# DataFrame.append was removed in pandas 2.0; build the frame from the row list instead.
df = pd.DataFrame(rows)
df['Number of Features to Select'] = df['Number of Features to Select'].astype(int)
# Best subset first: highest test recall, ties broken by test F1.
df = df.sort_values(by = ['Test Recall Score', 'Test F1 Score'], ascending=False)
# .iloc[0] = the top row AFTER sorting. The original label-based df['Best Features'][0]
# always returned the first row appended (n = 4), not the best-scoring subset.
Best_Features = df['Best Features'].iloc[0]
display(df.style.hide_index().set_precision(4).\
        set_properties(subset=['Best Features'], **{'background-color': 'Lavender', 'color': 'Black'}))
| Best Features | Number of Features to Select | Test F1 Score | Test Recall Score | Train F1 Score | Train Recall Score |
|---|---|---|---|---|---|
| ['Pregnancies', 'Glucose', 'Insulin', 'BMI', 'Diabetes Pedigree Function', 'Age'] | 6 | 0.5811 | 0.5309 | 0.7594 | 0.7005 |
| ['Pregnancies', 'Glucose', 'BMI', 'Diabetes Pedigree Function', 'Age'] | 5 | 0.5772 | 0.5309 | 0.7429 | 0.6952 |
| ['Glucose', 'BMI', 'Diabetes Pedigree Function', 'Age'] | 4 | 0.5753 | 0.5185 | 0.7500 | 0.7059 |
| ['Pregnancies', 'Glucose', 'Blood Pressure', 'Insulin', 'BMI', 'Diabetes Pedigree Function', 'Age'] | 7 | 0.5714 | 0.5185 | 0.7644 | 0.7112 |
| ['Pregnancies', 'Glucose', 'Blood Pressure', 'Skin Thickness', 'Insulin', 'BMI', 'Diabetes Pedigree Function', 'Age'] | 8 | 0.5556 | 0.4938 | 0.7816 | 0.7273 |
# --- Tuned GBC retrained on only the RFE-selected feature subset ---
Header('Gradient Boosting Classifier with the Best Parameters and Feature Ranking')
GBC = GradientBoostingClassifier(**grid.best_params_)
# NOTE(review): the label below is misleading -- these are the TUNED parameters, not defaults.
print('Default Parameters = %s' % GBC.get_params(deep=True))
_ = GBC.fit(X_train[Best_Features], y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(GBC, X = X[Best_Features], y = y, n_splits = 20)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'DarkGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'MediumBlue', 'color': 'White'}))
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = n_splits)
Header('Train Set', C = 'Green')
# sklearn convention: ravel() of a 2x2 confusion matrix yields (tn, fp, fn, tp).
tn, fp, fn, tp = CM_Train.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)
TNR = tn/(tn +fp)
BA = (TPR + TNR)/2
print('Precision (Train) = %.2f' % Precision)
print('Recall (Train) = %.2f' % Recall)
print('TPR (Train) = %.2f' % TPR)
print('TNR (Train) = %.2f' % TNR)
print('Balanced Accuracy (Train) = %.2f' % BA)
Header('Test Set')
tn, fp, fn, tp = CM_Test.ravel()
Precision = tp/(tp+fp)
Recall = tp/(tp + fn)
TPR = tp/(tp +fn)
TNR = tn/(tn +fp)
BA = (TPR + TNR)/2
# NOTE(review): PPCR is computed but never printed, and it is missing from the del below.
PPCR = (tp + fp)/(tp + fp + tn+ fn)
print('Precision (Test) = %.2f' % Precision)
print('Recall (Test) = %.2f' % Recall)
print('TPR (Test) = %.2f' % TPR)
print('TNR (Test) = %.2f' % TNR)
print('Balanced Accuracy (Test) = %.2f' % BA)
del tn, fp, fn, tp, Precision, Recall, TPR, TNR, BA
Line()
Gradient Boosting Classifier with the Best Parameters and Feature Ranking ========================== Default Parameters = {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.8, 'loss': 'exponential', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': 2, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.8571 ± 0.0073 | 0.8984 ± 0.0107 | 0.8772 ± 0.0070 | 350.0000 ± 0.0000 |
| Diabetic | 0.7913 ± 0.0179 | 0.7195 ± 0.0164 | 0.7536 ± 0.0135 | 187.0000 ± 0.0000 |
| accuracy | 0.8361 ± 0.0091 | 0.8361 ± 0.0091 | 0.8361 ± 0.0091 | 0.8361 ± 0.0091 |
| macro avg | 0.8242 ± 0.0110 | 0.8090 ± 0.0099 | 0.8154 ± 0.0101 | 537.0000 ± 0.0000 |
| weighted avg | 0.8342 ± 0.0093 | 0.8361 ± 0.0091 | 0.8342 ± 0.0091 | 537.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| Non-Diabetic | 0.7990 ± 0.0235 | 0.8353 ± 0.0262 | 0.8164 ± 0.0178 | 150.0000 ± 0.0000 |
| Diabetic | 0.6672 ± 0.0354 | 0.6093 ± 0.0579 | 0.6356 ± 0.0408 | 81.0000 ± 0.0000 |
| accuracy | 0.7561 ± 0.0236 | 0.7561 ± 0.0236 | 0.7561 ± 0.0236 | 0.7561 ± 0.0236 |
| macro avg | 0.7331 ± 0.0262 | 0.7223 ± 0.0292 | 0.7260 ± 0.0280 | 231.0000 ± 0.0000 |
| weighted avg | 0.7528 ± 0.0245 | 0.7561 ± 0.0236 | 0.7530 ± 0.0246 | 231.0000 ± 0.0000 |
Train Set ========================================================================================== Precision (Train) = 0.79 Recall (Train) = 0.72 TPR (Train) = 0.72 TNR (Train) = 0.90 Balanced Accuracy (Train) = 0.81 Test Set =========================================================================================== Precision (Test) = 0.66 Recall (Test) = 0.60 TPR (Test) = 0.60 TNR (Test) = 0.83 Balanced Accuracy (Test) = 0.72 ====================================================================================================
In this article, we tuned a GBC model with the best parameters and selected fewer features as the best features. However, the model accuracy didn't improve over that of the initial model.